Libraries

library(readxl)
library(DataExplorer)
library(reshape2)
library(ggplot2)
library(dplyr)
## 
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
library(EnvStats)
## 
## Attaching package: 'EnvStats'
## The following objects are masked from 'package:stats':
## 
##     predict, predict.lm
## The following object is masked from 'package:base':
## 
##     print.default
library(gridExtra)
## 
## Attaching package: 'gridExtra'
## The following object is masked from 'package:dplyr':
## 
##     combine
library(lubridate)
## 
## Attaching package: 'lubridate'
## The following objects are masked from 'package:base':
## 
##     date, intersect, setdiff, union
library(tidyverse)
## ── Attaching packages
## ───────────────────────────────────────
## tidyverse 1.3.2 ──
## ✔ tibble  3.1.8     ✔ purrr   0.3.4
## ✔ tidyr   1.2.0     ✔ stringr 1.4.1
## ✔ readr   2.1.2     ✔ forcats 0.5.2
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ lubridate::as.difftime() masks base::as.difftime()
## ✖ gridExtra::combine()     masks dplyr::combine()
## ✖ lubridate::date()        masks base::date()
## ✖ dplyr::filter()          masks stats::filter()
## ✖ lubridate::intersect()   masks base::intersect()
## ✖ dplyr::lag()             masks stats::lag()
## ✖ lubridate::setdiff()     masks base::setdiff()
## ✖ lubridate::union()       masks base::union()
library(rospca)
library(knitr)

#Data Cleaning \(Note:\) This is the first code to be run for this analysis.

Data Cleaning

#Using 'Perchlorate codebook.pdf' as data dictionary

### Additional Notes from Mark Remiker, August 2022 ###
#Num_in_household 
#-  No data entry error here - that’s the response!  
#Num_children
#-  Same!  That’s the actual response.  
#Armed_forces
#-  1 = Yes, 2 = No, 3 = Don’t know, 4 = Refuse
#-  61 = Yes, 62 = No, 63 = Don’t know, 64 = Refuse
unique(Metals$Armed_forces)
## [1]  1 62  2 NA
Metals$Armed_forces[Metals$Armed_forces==62] <- 2
unique(Metals$Armed_forces)
## [1]  1  2 NA
#Employment
#-  1 = Working for pay, 2= Self-employed, 3= Looking for work, 4= Temporarily laid off, 5 = Retired, 6= A homemaker, 7= Student, 8= Permanently disabled, 9= Maternity/sick leave, 10= Other, 11= Don’t know, 12 = Refused
#Health_care
#-  21= Yes, 22= No, 24= I don’t know, 23= Refuse
unique(Metals$Health_care)
## [1]  1 21  2 22  3  4 NA
Metals$Health_care[Metals$Health_care==21] <- 1
Metals$Health_care[Metals$Health_care==22] <- 2
unique(Metals$Health_care)
## [1]  1  2  3  4 NA
#Health_consult
#-  21= Yes, 22= No, 24= I don’t know, 23= Refuse
unique(Metals$Health_consult)
##  [1]  1 31  3  2  4 33 32 34 35 NA
Metals$Health_consult[Metals$Health_consult==31] <- 1
Metals$Health_consult[Metals$Health_consult==32] <- 2
Metals$Health_consult[Metals$Health_consult==33] <- 3
Metals$Health_consult[Metals$Health_consult==34] <- 4
Metals$Health_consult[Metals$Health_consult==35] <- 5
unique(Metals$Health_consult)
## [1]  1  3  2  4  5 NA
#Health_consult_place
#-  Blank and “.”  - Missing data
#Health options/Medical conditions
#-  1= Yes, 2= No, 3= Don’t know, 4= Refused 
#HTN, HTN_hx_meds, Cholesterol, Cholesterol_meds
#-  5= Yes, 6= No, 7= Don’t know 
unique(Metals$HTN)
## [1]  1  6  5  2  3  7 NA
Metals$HTN[Metals$HTN==5] <- 1
Metals$HTN[Metals$HTN==6] <- 2
Metals$HTN[Metals$HTN==7] <- 3
unique(Metals$HTN)
## [1]  1  2  3 NA
unique(Metals$HTN_hx_meds)
## [1] "1" "." "5" "6" "2" NA
Metals$HTN_hx_meds[Metals$HTN_hx_meds==5] <- 1
Metals$HTN_hx_meds[Metals$HTN_hx_meds==6] <- 2
unique(Metals$HTN_hx_meds)
## [1] "1" "." "2" NA
unique(Metals$Cholesterol)
## [1]  1  6  5  2  7  3 NA
Metals$Cholesterol[Metals$Cholesterol==5] <- 1
Metals$Cholesterol[Metals$Cholesterol==6] <- 2
Metals$Cholesterol[Metals$Cholesterol==7] <- 3
unique(Metals$Cholesterol)
## [1]  1  2  3 NA
#Diabetes
#-  31= Yes, 32= No, 33= Prediabetes 35= Don't know
unique(Metals$Diabetes)
## [1]  2 32 31  1  3 33 35 NA
Metals$Diabetes[Metals$Diabetes==31] <- 1
Metals$Diabetes[Metals$Diabetes==32] <- 3
Metals$Diabetes[Metals$Diabetes==33] <- 2
Metals$Diabetes[Metals$Diabetes==35] <- 4
unique(Metals$Diabetes)
## [1]  2  3  1  4 NA
#Dm_age
#-  “.” = missing data, no response.
#Endo: does the participant have endocrinology data
#-  .00 = No, 1.00 = yes 
unique(Metals$endo)
## [1]  1  0 NA
#These are correct here.
#EMR
#-  0= No, 1= Yes

#Generally "." is missing data
Metals2 <- Metals %>% 
  select(-c(Date, DOB))

Metals2[Metals2=="."] <- NA

Metals3 <- Metals %>% 
  select(c(Date, DOB))

Metals <- cbind(Metals2, Metals3)

remove(Metals2)
remove(Metals3)
#Participant C1045 was 9/4/2018

Metals$Date[Metals$Date=="2013-01-04"] <- "2018-09-04"

Wondering About Recodes

Looks like this was converted to just 1=yes, 0=no.

An Analysis of Metals in Yuma County, Arizona and their Effects on Health Outcomes

This project will consider the associations between concentration levels of metals and metalloids measured in hair samples of 330 Yuma County residents in 2018 and the prevalence of negative health outcomes, including endocrine disruption, reproductive disorders, obesity, and neurological disorders.

Note: Before running this code, Data_Clean.Rmd should be run to clean the dataset used in this analysis.

Data Descriptions

#create_report(Metals) (Remove the # when you actually want to run - takes too much processing power otherwise.)

Data Visualization - Exploratory

Metal Concentrations

#The initial columns are for Mn55, Cu65, Cd111, Hg202, Pb208, and U238.
#There are additional columns with metal concentrations for arsenic and iron (As_75 and Fe_57) that come later after the hormone level columns.
#The differences in naming format make me think these were analyzed either at different times or on different machines or with different techniques, perhaps? 
#Going to make things uniform for posterity.
colnames(Metals)[colnames(Metals) == "As_75"] <- "As75"
colnames(Metals)[colnames(Metals) == "Fe_57"] <- "Fe57"

Melting Metals Together :)

data.melt <- melt(Metals,id.vars=c('ID','Date'), measure.vars=c('Mn55','Cu65','Cd111', 'Hg202', 'Pb208', 'U238', 'As75', 'Fe57'))

ggplot(data.melt) +
  geom_boxplot(aes(y=log(value), fill=variable))+
  theme(axis.ticks.x = element_blank(),
        axis.text.x = element_blank(),
        panel.background = element_rect(fill="#ffffff"),
        axis.line = element_line(color="#000000"))+
  ylab("Ln Concentration (ug/g)")+
  labs(fill='Metals')
## Warning in log(value): NaNs produced

## Warning in log(value): NaNs produced
## Warning: Removed 1432 rows containing non-finite values (stat_boxplot).

Concentrations by Date

#There is a single data point for the year 2013, with the rest being April-November 2018.
#This is likely either a mistake in input or some other error, but I will not include this point in subsequent analysis.
#The concentration values for this single row are blank anyway.
ggplot(data.melt, aes(y=log(value), x=Date, color=variable))+
  geom_point()+
  geom_smooth(se=F)+
  theme(panel.background = element_rect(fill="#ffffff"),
        axis.text.x=element_text(angle=60, hjust=1),
        axis.line = element_line(color="#000000"))+
  scale_x_date(date_breaks = "1 month", date_labels =  "%b")+
  ylab("Ln Concentration (Units)")+
  xlab("Date of Sample Collection, 2018")+
  labs(color='Metals')
## Warning in log(value): NaNs produced

## Warning in log(value): NaNs produced

## Warning in log(value): NaNs produced
## `geom_smooth()` using method = 'loess' and formula 'y ~ x'
## Warning: Removed 1455 rows containing non-finite values (stat_smooth).
## Warning: Removed 1455 rows containing missing values (geom_point).

#Relevant paper: https://www.sciencedirect.com/science/article/pii/S1878535212000263 

Tables and Observations for Each Metal

data.melt %>% 
  group_by(variable) %>% 
  summarize(number_NAs=sum(is.na(value)))
## # A tibble: 8 × 2
##   variable number_NAs
##   <fct>         <int>
## 1 Mn55            123
## 2 Cu65            123
## 3 Cd111           123
## 4 Hg202           212
## 5 Pb208           123
## 6 U238            123
## 7 As75            283
## 8 Fe57            310
#Seeing if there are negative concentrations
data.melt %>% 
  group_by(variable) %>% 
  summarize(n_negative=sum(value<0, na.rm = T))
## # A tibble: 8 × 2
##   variable n_negative
##   <fct>         <int>
## 1 Mn55              0
## 2 Cu65              0
## 3 Cd111             0
## 4 Hg202             0
## 5 Pb208             0
## 6 U238              0
## 7 As75             12
## 8 Fe57              0
#There are twelve negative concentration levels of arsenic. This will mess with taking the lognormal to transform.
#Need to look up what to do when concentrations in a dataset are negative. Do you ignore? Do you replace with DL/sqrt(2)? Do you leave them as is and it's just tough luck?
#For now I will simply ignore!

#Making a table re: metal concentration values
data.melt$value <- as.numeric(data.melt$value)
knitr::kable(data.melt %>%
               filter(value>=0) %>% #This is ignoring negative concentrations in dataset, of which As75 has twelve.
               group_by(variable) %>% 
               summarize(n=sum(!is.na(value)),
                         geomean=round(geoMean(value, na.rm = T), digits = 2),
                         geosd=round(geoSD(value, na.rm = T), digits = 2)))
variable n geomean geosd
Mn55 207 0.97 4.10
Cu65 207 22.90 4.87
Cd111 207 0.07 2.18
Hg202 118 0.28 4.07
Pb208 207 0.48 3.95
U238 207 0.11 3.33
As75 35 0.02 3.23
Fe57 20 178.99 3.73
#create_report(data.melt) (Only run if you need to.)

Distribution of Metals Data

hist(Metals$Mn55)

hist(Metals$Cu65)

hist(Metals$Cd111)

hist(Metals$Hg202)

hist(Metals$Pb208)

hist(Metals$U238)

hist(Metals$As75)

#If negative values are excluded, lognormal is appropriate.

hist(Metals$Fe57)

Hormone Levels

#Looking at total levels first before free 
TSH <- ggplot(Metals) +geom_boxplot(aes(y=TSH)) +theme(axis.ticks.x = element_blank(),axis.text.x = element_blank()) +theme_classic() +ylab("TSH (units)")
T3 <- ggplot(Metals) +geom_boxplot(aes(y=TotalT3)) +theme(axis.ticks.x = element_blank(),axis.text.x = element_blank()) +theme_classic() +ylab("Total T3 (units)")
T4 <- ggplot(Metals) +geom_boxplot(aes(y=TotalT4)) +theme(axis.ticks.x = element_blank(),axis.text.x = element_blank()) +theme_classic() +ylab("Total T4 (units)")
Cortisol <- ggplot(Metals) +geom_boxplot(aes(y=Cortisol)) +theme(axis.ticks.x = element_blank(),axis.text.x = element_blank()) +theme_classic() +ylab("Cortisol (units)")

grid.arrange(TSH, Cortisol, T3, T4)

#Looking at distributions
hist(Metals$TSH)

hist(Metals$TotalT3)

hist(Metals$TotalT4)

hist(Metals$Cortisol)

#Levels by date
data.melt2 <- melt(Metals,id.vars=c('ID','Date'), measure.vars=c('TSH','TotalT3','TotalT4', 'Cortisol'))

ggplot(data.melt2, aes(y=value, x=Date, color=variable))+
  geom_point()+
  geom_smooth(se=F)+
  theme(panel.background = element_rect(fill="#ffffff"),
        axis.text.x=element_text(angle=60, hjust=1),
        axis.line = element_line(color="#000000"))+
  scale_x_date(date_breaks = "1 month", date_labels =  "%b")+
  ylab("Hormone Levels (Units)")+
  xlab("Date of Sample Collection")+
  labs(color='Hormones')
## `geom_smooth()` using method = 'loess' and formula 'y ~ x'

#There are some dates here from 2013, but the concentration values are blank, so I removed them from the graphical display.

#create_report(data.melt2) - Use when needed.

Plotting Hormone Levels Against Metal Concentrations

#This may be irrelevant if we use a PCA, but doing it anyway just to see...
#I can always do this again but with the simplified results of the PCA

#Cortisol
Mn_Cortisol <- ggplot(Metals, aes(x=log(Mn55), y=Cortisol))+geom_point()
Cu_Cortisol <- ggplot(Metals, aes(x=log(Cu65), y=Cortisol))+geom_point()
Cd_Cortisol <- ggplot(Metals, aes(x=log(Cd111), y=Cortisol))+geom_point()
Hg_Cortisol <- ggplot(Metals, aes(x=log(Hg202), y=Cortisol))+geom_point()
Pb_Cortisol <- ggplot(Metals, aes(x=log(Pb208), y=Cortisol))+geom_point()
U_Cortisol <- ggplot(Metals, aes(x=log(U238), y=Cortisol))+geom_point()
As_Cortisol <- ggplot(Metals, aes(x=log(As75), y=Cortisol))+geom_point()
Fe_Cortisol <- ggplot(Metals, aes(x=log(Fe57), y=Cortisol))+geom_point()

grid.arrange(Mn_Cortisol, Cu_Cortisol, Cd_Cortisol, Hg_Cortisol, Pb_Cortisol, U_Cortisol, As_Cortisol, Fe_Cortisol)
## Warning: Removed 129 rows containing missing values (geom_point).
## Removed 129 rows containing missing values (geom_point).
## Removed 129 rows containing missing values (geom_point).
## Warning: Removed 215 rows containing missing values (geom_point).
## Warning: Removed 129 rows containing missing values (geom_point).
## Removed 129 rows containing missing values (geom_point).
## Warning in log(As75): NaNs produced

## Warning in log(As75): NaNs produced
## Warning: Removed 297 rows containing missing values (geom_point).
## Warning: Removed 310 rows containing missing values (geom_point).

An Analysis of Survey and Sample Participant Demographics in Yuma County, Arizona

This project will consider the associations between concentration levels of metals and metalloids measured in hair samples of 330 Yuma County residents in 2018 and the prevalence of negative health outcomes, including endocrine disruption, reproductive disorders, obesity, and neurological disorders.

\(Note:\) Before running this code, Data_Clean.Rmd and Yuma_Metals_Analysis.Rmb should be run to clean the dataset used in this analysis.

General Demographics Table

colnames(Metals)
##   [1] "ID"                        "Mn55"                     
##   [3] "Cu65"                      "Cd111"                    
##   [5] "Hg202"                     "Pb208"                    
##   [7] "U238"                      "Site"                     
##   [9] "Gender"                    "DOB_month"                
##  [11] "DOB_day"                   "DOB_year"                 
##  [13] "Ethnicity"                 "Race_ai_an"               
##  [15] "Race_asian"                "Race_black"               
##  [17] "Race_pac_isl"              "Race_white"               
##  [19] "Race_other"                "Race_dk"                  
##  [21] "Race_refuse"               "Race_other_TEXT"          
##  [23] "Marital_status"            "Num_in_household"         
##  [25] "Num_children"              "Child_under_18"           
##  [27] "Education"                 "Education_TEXT"           
##  [29] "Birth_country"             "Birth_country_TEXT"       
##  [31] "Armed_forces"              "Army"                     
##  [33] "Navy"                      "Marine_corp"              
##  [35] "Air_force"                 "Coast_guard"              
##  [37] "Other"                     "Armed_forces_dk"          
##  [39] "Armed_forces_refuse"       "Armed_forces_TEXT"        
##  [41] "Work_for_pay"              "Self_employed"            
##  [43] "Looking"                   "Laid_off"                 
##  [45] "Retired"                   "Homemaker"                
##  [47] "Student"                   "Disabled"                 
##  [49] "Maternity_sick_leave"      "employ_other"             
##  [51] "employ_other_A"            "employ_dk"                
##  [53] "employ_TEXT"               "Occupation"               
##  [55] "Farmer"                    "Pesticides"               
##  [57] "Lived_in_Yuma"             "Born_in_Yuma"             
##  [59] "Yuma_pregnant_mom"         "Residence_type"           
##  [61] "Residence_TEXT"            "Own_rent"                 
##  [63] "Own_rent_TEXT"             "Homeless"                 
##  [65] "Water_home"                "Water_home_TEXT"          
##  [67] "Water_cooking"             "Water_cooking_TEXT"       
##  [69] "Water_coffee"              "Water_coffee_TEXT"        
##  [71] "Water_work"                "Water_work_TEXT"          
##  [73] "Health_care"               "Health_care_source"       
##  [75] "Health_care_source_TEXT"   "Healthcare_lastyear"      
##  [77] "Health_consult"            "Health_consult_place"     
##  [79] "Health_consult_place_TEXT" "Routine_care"             
##  [81] "Routine_care_TEXT"         "Doctor_time"              
##  [83] "ER_visits"                 "afford_rx_meds"           
##  [85] "afford_mh_care"            "afford_dental_care"       
##  [87] "afford_eyeglasses"         "afford_specialist"        
##  [89] "afford_follow_up"          "Delay_telephone"          
##  [91] "Delay_appt"                "Delay_wait"               
##  [93] "Delay_open"                "Delay_transportation"     
##  [95] "Mental_health"             "General_health"           
##  [97] "Health_compare"            "Height_ft"                
##  [99] "Height_inch"               "Weight"                   
## [101] "Overweight"                "Dental_visit"             
## [103] "Dental_health"             "arthritis"                
## [105] "gout"                      "chf"                      
## [107] "chd"                       "angnia"                   
## [109] "heart_attack"              "stroke"                   
## [111] "emphysema"                 "thyroid_prob"             
## [113] "hyperthyroid"              "hypothyroid"              
## [115] "thyroid_cancer"            "goiter"                   
## [117] "weak_kidney"               "cancer"                   
## [119] "pos"                       "undescended"              
## [121] "ambiguous"                 "HTN"                      
## [123] "HTN_hx_meds"               "HTN_current_med"          
## [125] "Cholesterol"               "Chol_med"                 
## [127] "Diabetes"                  "DM_age"                   
## [129] "DM_age_TEXT"               "insulin"                  
## [131] "diabetic_pills"            "Income"                   
## [133] "age"                       "thyroid_prob_D"           
## [135] "Hyper_D"                   "Hypo_D"                   
## [137] "Thyroid_cancer_D"          "Goiter_D"                 
## [139] "thyroid_conditions"        "clinical"                 
## [141] "endo"                      "fT3"                      
## [143] "TSH"                       "fT4"                      
## [145] "TotalT4"                   "TotalT3"                  
## [147] "Cortisol"                  "latitude"                 
## [149] "longitude"                 "emr_height_inches"        
## [151] "emr_weight"                "emr_bmi"                  
## [153] "emr_thyroid_problem"       "emr_hyperthyroidism"      
## [155] "emr_hypothyroidism"        "emr_thyroid_cancer"       
## [157] "emr_goiter"                "emr_cancer"               
## [159] "emr_pos"                   "emr_diabetes"             
## [161] "emr_obesity"               "sc"                       
## [163] "emr_sud"                   "emr_asd"                  
## [165] "meds"                      "meds_name"                
## [167] "thyroid_med"               "free_t3_med"              
## [169] "tsh_med"                   "total_t4_med"             
## [171] "free_t4_med"               "total_t3_med"             
## [173] "cortisol_med"              "Fe57"                     
## [175] "As75"                      "height_total"             
## [177] "bmi"                       "age_group"                
## [179] "Site_Collapsed"            "hyper_recode"             
## [181] "hypo_recode"               "thy_cancer_recode"        
## [183] "goiter_recode"             "pos_recode"               
## [185] "dm_recode"                 "emr_dm_recode"            
## [187] "sud"                       "tobacco_history"          
## [189] "number_years"              "pack_per_day"             
## [191] "quit_date"                 "tremor"                   
## [193] "FAC1_1"                    "FAC2_1"                   
## [195] "FAC3_1"                    "FAC4_1"                   
## [197] "FAC5_1"                    "Date"                     
## [199] "DOB"
#Date range of responses
kable(Metals %>% 
        # transform to date format with lubridate
        mutate(Date = ymd(Date)) %>% 
        # find min and max
        summarise(min = min(Date, na.rm = T),
                  max = max(Date, na.rm = T)))
min max
2018-04-20 2018-11-26
kable(Metals %>%
        filter(!is.na(ID)) %>%
        summarize("n"=length(ID)))
n
330
kable(t(Metals %>%
          summarize("Male"=sum(Gender==1, na.rm = T),
                    "Female"=sum(Gender==2, na.rm = T),
                    "Refused Gender"=sum(Gender==3, na.rm = T),
                    "18-39"=sum(age>=18 & age<40, na.rm = T),
                    "40-59"=sum(age>=40 & age<60, na.rm = T),
                    "60+"=sum(age>=60, na.rm = T),
                    "Hispanic"=sum(Ethnicity==1, na.rm = T),
                    "Not Hispanic"=sum(Ethnicity==2, na.rm = T),
                    "Married"=sum(Marital_status==1, na.rm=T),
                    "Widowed"=sum(Marital_status==2, na.rm=T),
                    "Divorced"=sum(Marital_status==3, na.rm=T),
                    "Separated"=sum(Marital_status==4, na.rm=T),
                    "Never married"=sum(Marital_status==5, na.rm=T),
                    "No school"=sum(Education==1,  na.rm = T),
                    "Elementary school"=sum(Education==2,  na.rm = T),
                    "Middle school"=sum(Education==3,  na.rm = T),
                    "High school"=sum(Education==4,  na.rm = T),
                    "College"=sum(Education==5,  na.rm = T),
                    "Advanced degree"=sum(Education==6,  na.rm = T),
                    "Other"=sum(Education==7,  na.rm = T))))
Male 68
Female 255
Refused Gender 0
18-39 93
40-59 153
60+ 72
Hispanic 286
Not Hispanic 37
Married 197
Widowed 20
Divorced 44
Separated 10
Never married 48
No school 20
Elementary school 41
Middle school 53
High school 93
College 80
Advanced degree 18
Other 18
kable(t(Metals %>%
          group_by(Site) %>%
          summarize("n"=length(Site),
                    "Male"=sum(Gender==1, na.rm = T),
                    "Female"=sum(Gender==2, na.rm = T),
                    "Refused Gender"=sum(Gender==3, na.rm = T),
                    "18-39"=sum(age>=18 & age<40, na.rm = T),
                    "40-59"=sum(age>=40 & age<60, na.rm = T),
                    "60+"=sum(age>=60, na.rm = T),
                    "Hispanic"=sum(Ethnicity==1, na.rm = T),
                    "Not Hispanic"=sum(Ethnicity==2, na.rm = T),
                    "Married"=sum(Marital_status==1, na.rm=T),
                    "Widowed"=sum(Marital_status==2, na.rm=T),
                    "Divorced"=sum(Marital_status==3, na.rm=T),
                    "Separated"=sum(Marital_status==4, na.rm=T),
                    "Never married"=sum(Marital_status==5, na.rm=T),
                    "No school"=sum(Education==1,  na.rm = T),
                    "Elementary school"=sum(Education==2,  na.rm = T),
                    "Middle school"=sum(Education==3,  na.rm = T),
                    "High school"=sum(Education==4,  na.rm = T),
                    "College"=sum(Education==5,  na.rm = T),
                    "Advanced degree"=sum(Education==6,  na.rm = T),
                    "Other"=sum(Education==7,  na.rm = T))))
Site 1 2 3
n 78 126 126
Male 28 17 23
Female 47 106 102
Refused Gender 0 0 0
18-39 22 19 52
40-59 35 69 49
60+ 15 35 22
Hispanic 75 123 88
Not Hispanic 0 0 37
Married 35 86 76
Widowed 8 5 7
Divorced 9 16 19
Separated 4 6 0
Never married 16 10 22
No school 6 13 1
Elementary school 6 33 2
Middle school 13 28 12
High school 20 32 41
College 20 11 49
Advanced degree 4 2 12
Other 6 4 8
#Below will give a % of folks who rated their general health as fair (4) or poor (5)
(Metals %>% 
        filter(General_health==4|General_health==5) %>% 
        summarise(Count = n()))/sum(!is.na(Metals$General_health))
##       Count
## 1 0.4643963
#Below will give a % of folks who are listed as having a "thyroid problem"
(Metals %>% 
  filter(thyroid_prob_D==1) %>% 
  summarise(Count = n()))/sum(!is.na(Metals$thyroid_prob_D))
##     Count
## 1 0.44375
(Metals %>% 
  filter(emr_thyroid_problem==1) %>% 
  summarise(Count = n()))/sum(!is.na(Metals$emr_thyroid_problem))
##       Count
## 1 0.3174603

Age

Metals$age <- as.numeric(Metals$age)

knitr::kable(Metals %>%
               summarise(mean=round(mean(age, na.rm=T), digits=1),
                         median=round(median(age, na.rm=T), digits=1),
                         min=round(min(age, na.rm=T), digits=1),
                         max=round(max(age, na.rm=T), digits=1),
                         NAs=sum(is.na(age))))
mean median min max NAs
48.4 50.4 18.5 89.9 12

Height, Weight, and BMI

knitr::kable(summary(round(Metals[c("emr_height_inches", "emr_weight", "emr_bmi", "height_total", "Weight", "bmi")]), digits=1), col.names = c("EMR Height (inches)", "EMR Weight (lbs)", "EMR BMI", "Measured Height (inches)", "Measured Weight (lbs)", "Measured BMI"))
EMR Height (inches) EMR Weight (lbs) EMR BMI Measured Height (inches) Measured Weight (lbs) Measured BMI
Min. :52 Min. : 92 Min. :17 Min. :54 Min. : 97 Min. :18
1st Qu.:61 1st Qu.:155 1st Qu.:27 1st Qu.:62 1st Qu.:152 1st Qu.:26
Median :63 Median :185 Median :32 Median :64 Median :179 Median :30
Mean :63 Mean :190 Mean :33 Mean :64 Mean :180 Mean :31
3rd Qu.:65 3rd Qu.:215 3rd Qu.:37 3rd Qu.:66 3rd Qu.:200 3rd Qu.:35
Max. :72 Max. :360 Max. :64 Max. :80 Max. :360 Max. :56
NA’s :138 NA’s :135 NA’s :150 NA’s :12 NA’s :7 NA’s :12

Determining if Differences Exist in EMR versus Measured Data

t.test(Metals$emr_bmi, Metals$bmi, paired = T)
## 
##  Paired t-test
## 
## data:  Metals$emr_bmi and Metals$bmi
## t = 5.3486, df = 172, p-value = 2.793e-07
## alternative hypothesis: true mean difference is not equal to 0
## 95 percent confidence interval:
##  0.7307578 1.5855707
## sample estimates:
## mean difference 
##        1.158164
#Some data is missing from the EMRs for BMI. The BMIs between the the EMR and measurements are different in general, but also there are differences between the paired data.

Statistical Differences Among Sites

summary(aov(Gender~as.factor(Site), data=Metals))
##                  Df Sum Sq Mean Sq F value   Pr(>F)    
## as.factor(Site)   2   2.72  1.3596   8.536 0.000244 ***
## Residuals       320  50.97  0.1593                     
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 7 observations deleted due to missingness
summary(aov(age~as.factor(Site), data=Metals))
##                  Df Sum Sq Mean Sq F value   Pr(>F)    
## as.factor(Site)   2   4099  2049.5   8.998 0.000158 ***
## Residuals       315  71749   227.8                     
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 12 observations deleted due to missingness
summary(aov(Ethnicity~as.factor(Site), data=Metals))
##                  Df Sum Sq Mean Sq F value Pr(>F)    
## as.factor(Site)   2  6.714   3.357   41.24 <2e-16 ***
## Residuals       320 26.048   0.081                   
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 7 observations deleted due to missingness
summary(aov(Marital_status~as.factor(Site), data=Metals))
##                  Df Sum Sq Mean Sq F value  Pr(>F)   
## as.factor(Site)   2   31.0  15.484   6.433 0.00182 **
## Residuals       320  770.2   2.407                   
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 7 observations deleted due to missingness
summary(aov(Education~as.factor(Site), data=Metals))
##                  Df Sum Sq Mean Sq F value   Pr(>F)    
## as.factor(Site)   2  138.2   69.10   38.74 8.61e-16 ***
## Residuals       320  570.9    1.78                     
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 7 observations deleted due to missingness
pairwise.t.test(Metals$Gender, as.factor(Metals$Site), adj="bonf")
## 
##  Pairwise comparisons using t tests with pooled SD 
## 
## data:  Metals$Gender and as.factor(Metals$Site) 
## 
##   1       2      
## 2 0.00022 -      
## 3 0.00257 0.36700
## 
## P value adjustment method: holm
pairwise.t.test(Metals$age, as.factor(Metals$Site), adj="bonf")
## 
##  Pairwise comparisons using t tests with pooled SD 
## 
## data:  Metals$age and as.factor(Metals$Site) 
## 
##   1      2     
## 2 0.0414 -     
## 3 0.1992 0.0001
## 
## P value adjustment method: holm
pairwise.t.test(Metals$Ethnicity, as.factor(Metals$Site), adj="bonf")
## 
##  Pairwise comparisons using t tests with pooled SD 
## 
## data:  Metals$Ethnicity and as.factor(Metals$Site) 
## 
##   1       2      
## 2 1       -      
## 3 1.6e-11 2.2e-14
## 
## P value adjustment method: holm
pairwise.t.test(Metals$Marital_status, as.factor(Metals$Site), adj="bonf")
## 
##  Pairwise comparisons using t tests with pooled SD 
## 
## data:  Metals$Marital_status and as.factor(Metals$Site) 
## 
##   1      2     
## 2 0.0012 -     
## 3 0.0740 0.0857
## 
## P value adjustment method: holm
pairwise.t.test(Metals$Education, as.factor(Metals$Site), adj="bonf")
## 
##  Pairwise comparisons using t tests with pooled SD 
## 
## data:  Metals$Education and as.factor(Metals$Site) 
## 
##   1       2      
## 2 1.2e-05 -      
## 3 0.003   3.5e-16
## 
## P value adjustment method: holm

Overview

PCA Code Written by Dr. Dean Billheimer

Explore the distributional properties of (hair?) metals concentrations from Yuma, AZ. Also, use Rmd file to illustrate use of R and markdown for computationally reproducible research.

Changes since 2021-08-21

  1. Values missing all metals are true missing – insuffucent sample.
  2. Values with missing Hg are also true missing (about 89 of these)
  • fit PCA excluding missing Hg
  • 2nd fit using imputed Hg from other metals data.
  1. drop sample R1108 - missing Cu value - not identified in spectrum. This was included in a batch of samples that failed QA/QC check

Major changes since 2020-07-03 version

  1. Updated data from Jonathan Credo - using 2021-08-23 version - sheet 3
  2. Many of the NAs are true missing -
  3. Jonathan re-coded BDL values to DL/sqrt(2)
  4. Re-run the analysis.
  5. Produce a new data set with robust PCA components for Jonathan’s use.

Major changes since 2020-06-23 version

  1. Omit observations where most metals are unmeasured
  2. Omit Pb206 and 207
  3. What to do about Hg NAs?

Takeaway

Standard and robust PCA yield generally similar results. There are enough differences, however, that the robust method would be my first choice for data summarization.

R Stuff

Set-up Environment and Read Data

#setwd
setwd("~/Desktop/Research/Yuma/Yuma - R Project/")

## read data
##filename <- c("Yuma Metals Project - EMR 19Jun20.xlsx")
filename <- c("Yuma Metals Project_EMR_Editted 23Aug2021 mental health & ID removed.xlsx") ## new data set
tmp <- read_excel(filename, sheet=3)
dim(tmp) ## 330 222 ## same dimensions
## [1] 330 199
 names(tmp)
##   [1] "ID"                        "Mn55"                     
##   [3] "Cu65"                      "Cd111"                    
##   [5] "Hg202"                     "Pb208"                    
##   [7] "U238"                      "Site"                     
##   [9] "Date"                      "Gender"                   
##  [11] "DOB_month"                 "DOB_day"                  
##  [13] "DOB_year"                  "DOB"                      
##  [15] "Ethnicity"                 "Race_ai_an"               
##  [17] "Race_asian"                "Race_black"               
##  [19] "Race_pac_isl"              "Race_white"               
##  [21] "Race_other"                "Race_dk"                  
##  [23] "Race_refuse"               "Race_other_TEXT"          
##  [25] "Marital_status"            "Num_in_household"         
##  [27] "Num_children"              "Child_under_18"           
##  [29] "Education"                 "Education_TEXT"           
##  [31] "Birth_country"             "Birth_country_TEXT"       
##  [33] "Armed_forces"              "Army"                     
##  [35] "Navy"                      "Marine_corp"              
##  [37] "Air_force"                 "Coast_guard"              
##  [39] "Other"                     "Armed_forces_dk"          
##  [41] "Armed_forces_refuse"       "Armed_forces_TEXT"        
##  [43] "Work_for_pay"              "Self_employed"            
##  [45] "Looking"                   "Laid_off"                 
##  [47] "Retired"                   "Homemaker"                
##  [49] "Student"                   "Disabled"                 
##  [51] "Maternity_sick_leave"      "employ_other"             
##  [53] "employ_other_A"            "employ_dk"                
##  [55] "employ_TEXT"               "Occupation"               
##  [57] "Farmer"                    "Pesticides"               
##  [59] "Lived_in_Yuma"             "Born_in_Yuma"             
##  [61] "Yuma_pregnant_mom"         "Residence_type"           
##  [63] "Residence_TEXT"            "Own_rent"                 
##  [65] "Own_rent_TEXT"             "Homeless"                 
##  [67] "Water_home"                "Water_home_TEXT"          
##  [69] "Water_cooking"             "Water_cooking_TEXT"       
##  [71] "Water_coffee"              "Water_coffee_TEXT"        
##  [73] "Water_work"                "Water_work_TEXT"          
##  [75] "Health_care"               "Health_care_source"       
##  [77] "Health_care_source_TEXT"   "Healthcare_lastyear"      
##  [79] "Health_consult"            "Health_consult_place"     
##  [81] "Health_consult_place_TEXT" "Routine_care"             
##  [83] "Routine_care_TEXT"         "Doctor_time"              
##  [85] "ER_visits"                 "afford_rx_meds"           
##  [87] "afford_mh_care"            "afford_dental_care"       
##  [89] "afford_eyeglasses"         "afford_specialist"        
##  [91] "afford_follow_up"          "Delay_telephone"          
##  [93] "Delay_appt"                "Delay_wait"               
##  [95] "Delay_open"                "Delay_transportation"     
##  [97] "Mental_health"             "General_health"           
##  [99] "Health_compare"            "Height_ft"                
## [101] "Height_inch"               "Weight"                   
## [103] "Overweight"                "Dental_visit"             
## [105] "Dental_health"             "arthritis"                
## [107] "gout"                      "chf"                      
## [109] "chd"                       "angnia"                   
## [111] "heart_attack"              "stroke"                   
## [113] "emphysema"                 "thyroid_prob"             
## [115] "hyperthyroid"              "hypothyroid"              
## [117] "thyroid_cancer"            "goiter"                   
## [119] "weak_kidney"               "cancer"                   
## [121] "pos"                       "undescended"              
## [123] "ambiguous"                 "HTN"                      
## [125] "HTN_hx_meds"               "HTN_current_med"          
## [127] "Cholesterol"               "Chol_med"                 
## [129] "Diabetes"                  "DM_age"                   
## [131] "DM_age_TEXT"               "insulin"                  
## [133] "diabetic_pills"            "Income"                   
## [135] "age"                       "thyroid_prob_D"           
## [137] "Hyper_D"                   "Hypo_D"                   
## [139] "Thyroid_cancer_D"          "Goiter_D"                 
## [141] "thyroid_conditions"        "clinical"                 
## [143] "endo"                      "fT3"                      
## [145] "TSH"                       "fT4"                      
## [147] "TotalT4"                   "TotalT3"                  
## [149] "Cortisol"                  "latitude"                 
## [151] "longitude"                 "emr_height_inches"        
## [153] "emr_weight"                "emr_bmi"                  
## [155] "emr_thyroid_problem"       "emr_hyperthyroidism"      
## [157] "emr_hypothyroidism"        "emr_thyroid_cancer"       
## [159] "emr_goiter"                "emr_cancer"               
## [161] "emr_pos"                   "emr_diabetes"             
## [163] "emr_obesity"               "sc"                       
## [165] "emr_sud"                   "emr_asd"                  
## [167] "meds"                      "meds_name"                
## [169] "thyroid_med"               "free_t3_med"              
## [171] "tsh_med"                   "total_t4_med"             
## [173] "free_t4_med"               "total_t3_med"             
## [175] "cortisol_med"              "Fe_57"                    
## [177] "As_75"                     "height_total"             
## [179] "bmi"                       "age_group"                
## [181] "Site_Collapsed"            "hyper_recode"             
## [183] "hypo_recode"               "thy_cancer_recode"        
## [185] "goiter_recode"             "pos_recode"               
## [187] "dm_recode"                 "emr_dm_recode"            
## [189] "sud"                       "tobacco_history"          
## [191] "number_years"              "pack_per_day"             
## [193] "quit_date"                 "tremor"                   
## [195] "FAC1_1"                    "FAC2_1"                   
## [197] "FAC3_1"                    "FAC4_1"                   
## [199] "FAC5_1"
 ##  [1] "ID"                        "Mn55"                     
 ##  [3] "Cu65"                      "Cd111"                    
 ##  [5] "Hg202"                     "Pb206"                    
 ##  [7] "Pb207"                     "PB208"                    
 ##  [9] "U238"                      "Site"                     
 ## [11] "Date"                      "Gender"                   

## head(tmp)

## keep only ID, metals and Site, cast to data frame
metals <- data.frame(tmp[,c(1:8)])
names(metals)  ## omitted Pb206 and Pb207
## [1] "ID"    "Mn55"  "Cu65"  "Cd111" "Hg202" "Pb208" "U238"  "Site"
## metals check format

Perform Initial Checks and Summary

The summary below shows there are 123 NAs for nearly all of the metals. Hg202 is the exception with 212 NAs.

Data exploration shows that the NA pattern is consistent across the metals. If one is missing, all are missing.

There are fewer measures at site 1. More missing values at site 1 (51/78) and at site 3 (48/126) compared with site 2 (24/126)

## Initial summary
summary(metals)
##       ID                 Mn55             Cu65               Cd111       
##  Length:330         Min.   :  0.06   Min.   :   0.1626   Min.   :0.0495  
##  Class :character   1st Qu.:  0.36   1st Qu.:  10.4350   1st Qu.:0.0495  
##  Mode  :character   Median :  0.75   Median :  20.7800   Median :0.0495  
##                     Mean   :  3.68   Mean   :  76.2073   Mean   :0.1292  
##                     3rd Qu.:  1.72   3rd Qu.:  49.9650   3rd Qu.:0.0495  
##                     Max.   :103.32   Max.   :1704.1300   Max.   :3.1000  
##                     NA's   :123      NA's   :123         NA's   :123     
##      Hg202              Pb208             U238               Site      
##  Min.   : 0.00707   Min.   : 0.020   Min.   : 0.03536   Min.   :1.000  
##  1st Qu.: 0.13050   1st Qu.: 0.180   1st Qu.: 0.03536   1st Qu.:2.000  
##  Median : 0.27300   Median : 0.440   Median : 0.08000   Median :2.000  
##  Mean   : 1.59401   Mean   : 1.825   Mean   : 0.36315   Mean   :2.145  
##  3rd Qu.: 0.48325   3rd Qu.: 1.160   3rd Qu.: 0.21500   3rd Qu.:3.000  
##  Max.   :60.17500   Max.   :76.450   Max.   :21.08000   Max.   :3.000  
##  NA's   :212        NA's   :123      NA's   :123
## Number of Observations per Site
table(metals$Site) ## 78 126 and 126.  Fewer at site 1
## 
##   1   2   3 
##  78 126 126
## Number of values > BDL by Site
table(metals$Site, !is.na(metals[,2])) ## Observation indicator by site
##    
##     FALSE TRUE
##   1    51   27
##   2    24  102
##   3    48   78

Metals Concentrations

The figure below shows metals concentrations on log10 scale. NA values are omitted.

The shape of the Pb distributions are nearly identical (omitted 206 and 207). On the whole, these distributions are not as bad as I feared. All are stil right-skewed after transformation, but they do not look pathologic.

## change from wide to long format for plotting

metals %>% pivot_longer(-c(ID, Site), names_to="Metal", values_to="Concentration") %>%
    ggplot(aes(x=Concentration)) + geom_density() +
        scale_x_log10() + facet_wrap(~ Metal) +
            ggtitle("Distributions of Metals Concentrations (log10 transform)")
## Warning: Removed 827 rows containing non-finite values (stat_density).
Metals concentrations - log10 scale

Metals concentrations - log10 scale

Remove Missing Values

Values labeled as NA across (nearly all metals) are not measured. For each observation with missing metals, omit the observation. In addition there are 89 observations with missing Hg values. We may impute these missing values based on site and other observed metals

Note - Other BDL values are replaced with 1/2 the limit of detection.

names(metals)
## [1] "ID"    "Mn55"  "Cu65"  "Cd111" "Hg202" "Pb208" "U238"  "Site"
metals.obs <- metals[!is.na(metals[,2]), ]
## for Hg, replace NA and zero with 0.029/sqrt(2)
##sort(metals.obs[,"Hg202"])
## Hg.subs <- 0.029 /sqrt(2)

metals.complete.cases <- metals.obs %>% filter(!is.na(Cu65)) %>% 
    filter(!is.na(Hg202)) ## ID R1108 also missing Hg, other metals observed

dim(metals.complete.cases) ## 111 8 wow - small data set
## [1] 111   8
summary(metals.complete.cases)
##       ID                 Mn55              Cu65               Cd111        
##  Length:111         Min.   :  0.070   Min.   :   0.1626   Min.   :0.04950  
##  Class :character   1st Qu.:  0.355   1st Qu.:  14.1900   1st Qu.:0.04950  
##  Mode  :character   Median :  0.810   Median :  24.7600   Median :0.04950  
##                     Mean   :  4.727   Mean   :  94.9425   Mean   :0.13170  
##                     3rd Qu.:  2.175   3rd Qu.:  73.0850   3rd Qu.:0.06475  
##                     Max.   :103.320   Max.   :1088.6500   Max.   :2.87000  
##      Hg202              Pb208             U238              Site      
##  Min.   : 0.00707   Min.   : 0.020   Min.   :0.03535   Min.   :1.000  
##  1st Qu.: 0.14550   1st Qu.: 0.185   1st Qu.:0.03535   1st Qu.:2.000  
##  Median : 0.29100   Median : 0.450   Median :0.08000   Median :2.000  
##  Mean   : 1.68130   Mean   : 2.230   Mean   :0.34223   Mean   :2.216  
##  3rd Qu.: 0.49500   3rd Qu.: 1.225   3rd Qu.:0.24500   3rd Qu.:3.000  
##  Max.   :60.17500   Max.   :76.450   Max.   :9.24000   Max.   :3.000
metals.impute.Hg <- metals.obs %>% filter(!is.na(Cu65)) %>% 
    filter(is.na(Hg202))

metals.complete.log10 <- cbind(ID=metals.complete.cases[,"ID"],
                               log10(metals.complete.cases[,2:7]),
                               Site=metals.complete.cases[,"Site"])

summary(metals.complete.log10)
##       ID                 Mn55               Cu65             Cd111        
##  Length:111         Min.   :-1.15490   Min.   :-0.7888   Min.   :-1.3054  
##  Class :character   1st Qu.:-0.45016   1st Qu.: 1.1520   1st Qu.:-1.3054  
##  Mode  :character   Median :-0.09151   Median : 1.3938   Median :-1.3054  
##                     Mean   : 0.05038   Mean   : 1.5370   Mean   :-1.1448  
##                     3rd Qu.: 0.33745   3rd Qu.: 1.8638   3rd Qu.:-1.2012  
##                     Max.   : 2.01418   Max.   : 3.0369   Max.   : 0.4579  
##      Hg202             Pb208               U238              Site      
##  Min.   :-2.1505   Min.   :-1.69897   Min.   :-1.4515   Min.   :1.000  
##  1st Qu.:-0.8373   1st Qu.:-0.73299   1st Qu.:-1.4515   1st Qu.:2.000  
##  Median :-0.5361   Median :-0.34679   Median :-1.0969   Median :2.000  
##  Mean   :-0.5300   Mean   :-0.31046   Mean   :-0.9353   Mean   :2.216  
##  3rd Qu.:-0.3055   3rd Qu.: 0.08784   3rd Qu.:-0.6109   3rd Qu.:3.000  
##  Max.   : 1.7794   Max.   : 1.88338   Max.   : 0.9657   Max.   :3.000
round(cor(metals.complete.log10[,2:7]), 2)
##       Mn55 Cu65 Cd111 Hg202 Pb208 U238
## Mn55  1.00 0.48  0.46  0.07  0.55 0.15
## Cu65  0.48 1.00  0.30  0.15  0.59 0.20
## Cd111 0.46 0.30  1.00  0.15  0.60 0.20
## Hg202 0.07 0.15  0.15  1.00  0.21 0.09
## Pb208 0.55 0.59  0.60  0.21  1.00 0.26
## U238  0.15 0.20  0.20  0.09  0.26 1.00
metals.impute.log10 <- cbind(ID=metals.impute.Hg[,"ID"],
                               log10(metals.impute.Hg[,2:7]),
                               Site=metals.impute.Hg[,"Site"])
summary(metals.impute.log10)
##       ID                 Mn55               Cu65             Cd111        
##  Length:95          Min.   :-1.22185   Min.   :-0.7888   Min.   :-1.3054  
##  Class :character   1st Qu.:-0.43807   1st Qu.: 0.9668   1st Qu.:-1.3054  
##  Mode  :character   Median :-0.16749   Median : 1.1884   Median :-1.3054  
##                     Mean   :-0.09223   Mean   : 1.1755   Mean   :-1.1551  
##                     3rd Qu.: 0.09320   3rd Qu.: 1.4984   3rd Qu.:-1.3054  
##                     Max.   : 1.56785   Max.   : 3.2315   Max.   : 0.4914  
##                                                                           
##      Hg202         Pb208               U238              Site      
##  Min.   : NA   Min.   :-1.69897   Min.   :-1.4515   Min.   :1.000  
##  1st Qu.: NA   1st Qu.:-0.76955   1st Qu.:-1.4515   1st Qu.:2.000  
##  Median : NA   Median :-0.38722   Median :-1.1549   Median :2.000  
##  Mean   :NaN   Mean   :-0.33314   Mean   :-1.0172   Mean   :2.284  
##  3rd Qu.: NA   3rd Qu.:-0.02228   3rd Qu.:-0.7447   3rd Qu.:3.000  
##  Max.   : NA   Max.   : 1.30168   Max.   : 1.3239   Max.   :3.000  
##  NA's   :95
metals.impute.Hg
##       ID  Mn55         Cu65      Cd111 Hg202 Pb208        U238 Site
## 1  R1005  0.39   59.8300000 0.04949747    NA  0.36  0.03535534    2
## 2  R1001  1.43   33.5000000 0.04949747    NA  0.17  0.07000000    2
## 3  C1005 17.88  179.6800000 0.28000000    NA 20.03  0.14000000    1
## 4  R1011  0.68   44.3900000 0.04949747    NA  0.27  0.06000000    2
## 5  R1009 12.87   13.6600000 0.18000000    NA  6.51  0.25000000    2
## 6  C1013  0.82    9.2600000 0.04949747    NA  0.30  0.03535534    1
## 7  C1014  0.51   32.0500000 0.04949747    NA  0.26  0.03535534    1
## 8  R1018  0.40   18.8100000 0.04949747    NA  0.06  0.03535534    2
## 9  R1019  0.96    9.6300000 0.13000000    NA  2.87  0.03535534    2
## 10 C1017  0.28   13.9100000 0.04949747    NA  0.11  0.03535534    1
## 11 C1022  1.20   22.6400000 0.04949747    NA  0.95  0.15000000    1
## 12 C1026  0.17    6.2200000 0.04949747    NA  1.72  0.22000000    1
## 13 C1018  0.16   22.8200000 0.04949747    NA  0.08  0.23000000    1
## 14 R1021  1.28    0.1626346 0.04949747    NA  0.68  0.03535534    2
## 15 R1022  1.11   97.6600000 0.04949747    NA  0.62  0.64000000    2
## 16 R1025  3.33   29.4200000 0.11000000    NA  5.46  0.37000000    2
## 17 R1026  2.03   21.9900000 0.42000000    NA  0.95  0.25000000    2
## 18 R1032  0.54   32.1600000 0.04949747    NA  0.81  1.50000000    2
## 19 R1043  1.05   13.6900000 0.04949747    NA  0.53  0.06000000    2
## 20 R1050  0.75   20.7800000 0.04949747    NA  0.04  0.03535534    2
## 21 R1049  0.18    7.0400000 0.04949747    NA  0.48  0.03535534    2
## 22 R1051 11.08   15.6500000 0.04949747    NA  0.14  0.05000000    2
## 23 R1062  0.57    0.1626346 0.04949747    NA  0.02  0.03535534    2
## 24 R1035  0.47   59.0400000 0.04949747    NA  0.38  0.03535534    2
## 25 R1059  0.78   23.8800000 0.04949747    NA  1.25  0.03535534    2
## 26 C1035  0.46    9.8000000 0.04949747    NA  0.73  0.03535534    1
## 27 C1038  0.71    9.4900000 0.17000000    NA  0.88  0.03535534    1
## 28 C1037  0.81    3.3000000 0.09000000    NA  1.69  0.03535534    1
## 29 C1036  1.71   27.6000000 0.04949747    NA  0.06  0.05000000    1
## 30 R1038  0.50   19.5300000 0.12000000    NA  0.41  0.13000000    2
## 31 R1070  0.49   32.3300000 0.04949747    NA  0.12  0.03535534    2
## 32 R1040  0.81   27.0800000 0.04949747    NA  0.68  0.03535534    2
## 33 R1071  6.16   12.0800000 0.04949747    NA  1.15  0.22000000    2
## 34 R1074  0.61    4.6500000 0.04949747    NA  0.40  0.38000000    2
## 35 R1084  1.00   43.8200000 0.04949747    NA  0.16  0.03535534    2
## 36 R1082 21.35  131.1400000 0.04949747    NA  1.17  0.03535534    2
## 37 R1086  5.91  149.6300000 0.14000000    NA  1.52  0.29000000    2
## 38 R1093  0.35   10.1400000 0.04949747    NA  0.41  0.03535534    2
## 39 R1102  0.35   21.7900000 0.04949747    NA  0.10  0.03535534    2
## 40 R1103  0.30   47.7300000 0.04949747    NA  0.58  0.03535534    2
## 41 C1057  2.28    0.1626346 0.04949747    NA  3.19  0.03535534    1
## 42 R1101  0.54   62.4500000 0.04949747    NA  0.78  0.11000000    2
## 43 R1107  0.76   19.5300000 0.04949747    NA  0.61  0.05000000    2
## 44 R1109  6.77  417.0400000 0.30000000    NA 13.79  1.10000000    2
## 45 R1112  0.55   13.0900000 0.04949747    NA  1.20  0.17000000    2
## 46 R1110  1.46    9.4500000 0.04949747    NA  1.75  0.20000000    2
## 47 Y1019  0.15    6.0300000 0.04949747    NA  0.06  0.03535534    3
## 48 Y1017  1.02   16.9600000 0.04949747    NA  1.48  0.06000000    3
## 49 Y1018  0.38    8.7900000 0.04949747    NA  0.38  0.42000000    3
## 50 Y1020  0.24   30.1000000 0.04949747    NA  0.14  0.10000000    3
## 51 Y1021  0.34   21.7100000 0.04949747    NA  0.21  0.92000000    3
## 52 Y1024  0.38   12.9100000 0.04949747    NA  0.13  0.11000000    3
## 53 Y1023  0.41   10.1000000 0.04949747    NA  0.34  0.18000000    3
## 54 R1116 12.11 1704.1300000 0.34000000    NA 16.40  0.76000000    2
## 55 Y1026  0.41    7.6400000 0.04949747    NA  0.51  0.13000000    3
## 56 Y1028  1.06   74.7500000 0.04949747    NA  0.31  0.18000000    3
## 57 R1121  1.18   51.0100000 0.12000000    NA  1.41  0.14000000    2
## 58 R1122  0.61  244.5900000 0.11000000    NA  0.13  0.16000000    2
## 59 C1046  5.59   30.9600000 0.04949747    NA  0.67  0.03535534    1
## 60 R1123  0.75   11.3400000 0.04949747    NA  0.29  0.31000000    2
## 61 R1125  0.66   31.2100000 1.00000000    NA  0.86  0.03535534    2
## 62 Y1036  0.06    3.5500000 0.04949747    NA  0.21  0.26000000    3
## 63 Y1043  0.08   11.0700000 0.04949747    NA  0.40  0.15000000    3
## 64 Y1047 31.39   18.9300000 0.14000000    NA  1.86  1.00000000    3
## 65 Y1065  0.46   28.6700000 0.04949747    NA  0.25  0.03535534    3
## 66 Y1048  0.23   30.5000000 0.47000000    NA  0.31  0.03535534    3
## 67 Y1066  0.14   14.1600000 0.04949747    NA  0.15  0.07000000    3
## 68 Y1063  0.21    9.2700000 0.04949747    NA  0.51  0.09000000    3
## 69 Y1045  0.55    9.9700000 0.04949747    NA  0.20  0.14000000    3
## 70 Y1060  0.14   10.8600000 0.04949747    NA  0.17  0.70000000    3
## 71 Y1069  0.31   19.8400000 0.04949747    NA  0.10  0.03535534    3
## 72 Y1075  0.86  185.8700000 3.10000000    NA  0.86  0.03535534    3
## 73 Y1070  0.93    0.1626346 0.04949747    NA  1.72  0.13000000    3
## 74 Y1076  0.28    6.6900000 0.04949747    NA  0.12  0.03535534    3
## 75 Y1077  0.68   15.4300000 0.04949747    NA  0.19  0.03535534    3
## 76 Y1072  0.13    5.9300000 0.04949747    NA  0.06  0.09000000    3
## 77 Y1073  0.17    5.9000000 0.04949747    NA  0.16  0.09000000    3
## 78 Y1068  0.67    9.8500000 0.04949747    NA  0.16  0.34000000    3
## 79 Y1085  1.72    6.0000000 0.10000000    NA  0.85  0.03535534    3
## 80 Y1088  0.47    6.7400000 0.04949747    NA  0.17  0.03535534    3
## 81 Y1086  0.69    3.1100000 0.04949747    NA  0.18  0.03535534    3
## 82 Y1090  0.70    1.5200000 0.04949747    NA  2.06  0.03535534    3
## 83 Y1087  1.08   31.8000000 0.04949747    NA  0.83  0.12000000    3
## 84 Y1096  0.31   14.9800000 0.04949747    NA  0.39  0.03535534    3
## 85 Y1099 36.97   18.7300000 0.33000000    NA  0.86  0.03535534    3
## 86 Y1098  0.77   48.4900000 0.58000000    NA  2.92  0.16000000    3
## 87 Y1097  0.79   41.7600000 0.04949747    NA  0.11  0.25000000    3
## 88 Y1064  4.98    0.1626346 0.04949747    NA 10.17 21.08000000    3
## 89 Y1121  1.42   12.5500000 0.04949747    NA  0.09  0.03535534    3
## 90 Y1119  0.33    6.6000000 0.04949747    NA  0.86  0.17000000    3
## 91 Y1125  2.15    0.1626346 0.04949747    NA  1.59  0.03535534    3
## 92 Y1127  2.32   10.3300000 0.04949747    NA  0.34  0.16000000    3
## 93 Y1126  0.29    4.9300000 0.04949747    NA  0.25  0.39000000    3
## 94 C1104  3.67  506.8900000 0.18000000    NA  0.87  0.03535534    1
## 95 Y1008  0.14   12.6100000 0.04949747    NA  0.03  0.15000000    2
### fit regression line for Hg
Hg.lm <- lm(Hg202 ~ as.factor(Site) + Mn55 + Cu65 + Cd111 + Pb208 + U238,
            data=metals.complete.log10) 
qplot(metals.complete.log10[,"Hg202"], predict(Hg.lm))

qplot( predict(Hg.lm), resid(Hg.lm))

Hg.imputed <-  predict(Hg.lm, newdata=metals.impute.log10)

metals.fillin.log10 <- metals.impute.log10
metals.fillin.log10[,"Hg202"] <- Hg.imputed

summary(metals.fillin.log10)
##       ID                 Mn55               Cu65             Cd111        
##  Length:95          Min.   :-1.22185   Min.   :-0.7888   Min.   :-1.3054  
##  Class :character   1st Qu.:-0.43807   1st Qu.: 0.9668   1st Qu.:-1.3054  
##  Mode  :character   Median :-0.16749   Median : 1.1884   Median :-1.3054  
##                     Mean   :-0.09223   Mean   : 1.1755   Mean   :-1.1551  
##                     3rd Qu.: 0.09320   3rd Qu.: 1.4984   3rd Qu.:-1.3054  
##                     Max.   : 1.56785   Max.   : 3.2315   Max.   : 0.4914  
##      Hg202             Pb208               U238              Site      
##  Min.   :-0.9754   Min.   :-1.69897   Min.   :-1.4515   Min.   :1.000  
##  1st Qu.:-0.6857   1st Qu.:-0.76955   1st Qu.:-1.4515   1st Qu.:2.000  
##  Median :-0.5787   Median :-0.38722   Median :-1.1549   Median :2.000  
##  Mean   :-0.5607   Mean   :-0.33314   Mean   :-1.0172   Mean   :2.284  
##  3rd Qu.:-0.4261   3rd Qu.:-0.02228   3rd Qu.:-0.7447   3rd Qu.:3.000  
##  Max.   :-0.1436   Max.   : 1.30168   Max.   : 1.3239   Max.   :3.000
## combine complete cases with Hg imputed cases
metals.allobs.log10 <- rbind(metals.complete.log10, metals.fillin.log10)
## dim(metals.allobs.log10) ## 206 8  ## OK obs with missing Cu65 is omitted.
## dim(metals.obs) ## 207 8

## check summary
summary(metals.allobs.log10)
##       ID                 Mn55               Cu65             Cd111        
##  Length:206         Min.   :-1.22185   Min.   :-0.7888   Min.   :-1.3054  
##  Class :character   1st Qu.:-0.44990   1st Qu.: 1.0261   1st Qu.:-1.3054  
##  Mode  :character   Median :-0.12494   Median : 1.3212   Median :-1.3054  
##                     Mean   :-0.01539   Mean   : 1.3703   Mean   :-1.1495  
##                     3rd Qu.: 0.23553   3rd Qu.: 1.7031   3rd Qu.:-1.3054  
##                     Max.   : 2.01418   Max.   : 3.2315   Max.   : 0.4914  
##      Hg202             Pb208               U238              Site      
##  Min.   :-2.1505   Min.   :-1.69897   Min.   :-1.4515   Min.   :1.000  
##  1st Qu.:-0.7322   1st Qu.:-0.74473   1st Qu.:-1.4515   1st Qu.:2.000  
##  Median :-0.5680   Median :-0.36154   Median :-1.0969   Median :2.000  
##  Mean   :-0.5442   Mean   :-0.32092   Mean   :-0.9731   Mean   :2.248  
##  3rd Qu.:-0.3821   3rd Qu.: 0.06446   3rd Qu.:-0.6626   3rd Qu.:3.000  
##  Max.   : 1.7794   Max.   : 1.88338   Max.   : 1.3239   Max.   :3.000

Adjusted (Hg Imputed) Concentrations

New figure with Hg values imputed using regression on other metals

## change from wide to long format for plotting

metals.allobs.log10 %>% pivot_longer(-c(ID, Site), names_to="Metal", values_to="Concentration") %>%
    ggplot(aes(x=Concentration)) + geom_density() +
         facet_wrap(~ Metal) +
            ggtitle("Distributions of Metals Concentrations (missin Hg values imputed, log10 transform)")
Metals concentrations - log10 scale

Metals concentrations - log10 scale

Robust PCA

Now we move on to Robust PCA. We use the method described by the following papers.

  1. Hubert, M., Rousseeuw, P. J., and Vanden Branden, K. (2005), “ROBPCA: A New Approach to Robust Principal Component Analysis,” Technometrics, 47, 64–79.

  2. Engelen, S., Hubert, M. and Vanden Branden, K. (2005), “A Comparison of Three Procedures for Robust PCA in High Dimensions”, Austrian Journal of Statistics, 34, 117–126.

  3. Hubert, M., Rousseeuw, P. J., and Verdonck, T. (2009), “Robust PCA for Skewed Data and Its Outlier Map,” Computational Statistics & Data Analysis, 53, 2264–2274.

This is implemented in the R package \(rospca\)

The plot below shows results from the robust PCA. The “Score distance” describes the leverage that an indvidual point has in determining the fitted PCA surface. The “Orthogonal distance” shows the distance to the fitted surface.

Thus, points 31, 48, 85, and 200 heaviy influence the slope of the PCA surface (the loadings). Conversely, point 143 is a large distance from the mass of points, but has little influence on the loadings.

The eigenvalues (bottom of the box) show that of the variation is captured in PC1 (41%) and PC2 (20%).

##install.packages("rospca", repos="http://cran.r-project.org")

##dim(metals.adjusted)
##metals.log10 <- as.matrix(log10(metals.adjusted[, 2:7]))
colnames(metals.allobs.log10)
## [1] "ID"    "Mn55"  "Cu65"  "Cd111" "Hg202" "Pb208" "U238"  "Site"
## round(cor(metals.log10), 2) ## check pairwise correlation
### == original full PCA

metals.pca <- robpca(metals.allobs.log10[,2:7], k=5, alpha=.7, ndir="all", skew=T)

## Eigenvalues show variation in different components
cat("Total Variance in PC1 and PC2\n")
## Total Variance in PC1 and PC2
round(metals.pca[[2]]/sum(metals.pca[[2]]), 3) ## variation in PC's is as expected across components
##   PC1   PC2   PC3   PC4   PC5 
## 0.470 0.220 0.134 0.120 0.056
diagPlot(metals.pca)

The eigenvalues show that PC1 and PC2 exhibit the majority of the variance, PC 3-5 are smaller. In general, the diagnostic plot suggests that robust PCA is warranted

PCA Loadings

The loadings are shown below.

round(metals.pca$loadings, 3)
##         PC1    PC2    PC3    PC4    PC5
## Mn55  0.581  0.064  0.601 -0.533  0.083
## Cu65  0.406  0.269 -0.775 -0.394 -0.035
## Cd111 0.197  0.001 -0.008  0.051  0.007
## Hg202 0.114 -0.036 -0.086  0.181  0.972
## Pb208 0.651 -0.002  0.005  0.708 -0.215
## U238  0.147 -0.960 -0.174 -0.154 -0.041
names(metals.allobs.log10[,2:7])
## [1] "Mn55"  "Cu65"  "Cd111" "Hg202" "Pb208" "U238"
##plot(metals.pca$loadings, pch=rownames(metals.pca$loadings))

There are a few of points to notice about the loadings

  1. All components of PC1 have the same sign. As before, this suggests a “common” effect to all measurements. For example, all metals tend to be “high” or “low” together. May be caused either by global high-low exposure of inviduals, or by mass spectrometry measurement issues.

  2. PC1 has highest loading for Pb208, Mn55, and Cu65

  3. For PC2, U238 is the biggest driver.

PCA Scores for Individuals

Finally, we look at PCA scores by site. The second figure below shows clearly that site 2 has slightly more variation than site 3. But not bad. It appears that site 3 may be slightly greater on PC2 than is site 2.

##names(metals.allobs.log10)
metals.pca.df <- data.frame(Site=factor(metals.allobs.log10$Site), metals.pca$scores)

qplot(x=PC1, y=PC2, data=metals.pca.df, col=Site,
      title="Robust PCA Scores by Site")
## Warning: Ignoring unknown parameters: title

qplot(x=PC1, y=PC2, data=metals.pca.df) + ggtitle("Robust PCA Scores by Site") +
    facet_wrap(~ Site)

## dim(metals.pca$scores)
## dim(metals.allobs.log10)
## New df on 2021.09.06
metals.robpca <- cbind(metals.allobs.log10, metals.pca$scores)
## dim(metals.robpca)
### about here
qplot(x=PC1, y=Mn55, data=metals.robpca, col=as.factor(Site))

qplot(x=PC1, y=Pb208, data=metals.robpca, col=as.factor(Site))

qplot(x=PC2, y=U238, data=metals.robpca, col=as.factor(Site))

write.csv(metals.robpca, file="metals_Impute_log10_PCA_scores.csv", quote=F, row.names=F)
pca_No_Hg <- robpca(metals.allobs.log10[,c(2:4, 6, 7)], k=5, alpha=.7, ndir="all",
                    skew=T)
round(pca_No_Hg[[2]]/sum(pca_No_Hg[[2]]), 3) ## .48  .19 .11
##   PC1   PC2   PC3   PC4   PC5 
## 0.549 0.216 0.124 0.097 0.014
round(pca_No_Hg$loadings, 2)
##        PC1   PC2   PC3   PC4   PC5
## Mn55  0.61  0.17  0.02  0.77  0.10
## Cu65  0.47  0.32  0.68 -0.47  0.03
## Cd111 0.14 -0.05 -0.01  0.03 -0.99
## Pb208 0.56  0.03 -0.70 -0.44  0.07
## U238  0.28 -0.93  0.21 -0.04  0.08
## dim(pca_No_Hg$scores) ## 206 5
metals.noHg.robpca <- cbind(metals.allobs.log10, pca_No_Hg$scores)

write.csv(metals.noHg.robpca, file="metals_NoHg_log10_PCA_scores.csv", quote=F, row.names=F)
##system("pwd")
rmarkdown::render("yuma-metals-20210821.Rmd", 'html_document')

Exploration of Yuma Datasets

Based on meeting with Drs. Dean Billheimer and Frank von Hippel on 9/2/2022, this general analysis will explore how PC1 and PC2 relate to the health outcome data from the survey conducted in Yuma, AZ in 2018.

\(Note:\) The following scripts must be run prior to completing this analysis: Data_Clean.Rmd, Yuma_Metals_Analysis.Rmd, Demographics.Rmd, and yuma-metals-20210821.Rmd. This will provide the accurate dataframes used in the following code.

#Pb isotopes were removed because they were all heavily collinear. 
##Perhaps so much so that the isotopes were calculated based on one reference one.
##Kept Pb208.

#The classic vs. robust PCAs were different enough that Dean felt like keeping the robust version was worth it. Even after logging the data, they are still skewed.

#PC1 is all positively correlated, which shows that it is an analysis of overall exposure.

#PC2 shows which ones are correlated in the same +ve or -ve directions.

Counts of Cases and Controls for Each Outcome

These counts include only those who selected “yes” as cases and only those who explicitly selected “no” as controls. Others who chose options such as “don’t know” or “prefer not to answer” or those who skipped the question were removed from these counts.

#Numbers for each outcome
Case_Count <- Metals %>% 
  summarise("Arthritis"=sum(arthritis==1, na.rm = T),
            "Cancer"=sum(cancer==1, na.rm = T),
            "Cancer - EMR"=sum(emr_cancer==1, na.rm = T),
            "Goiter"=sum(goiter_recode==1, na.rm = T),
            "Goiter - EMR"=sum(emr_goiter==1, na.rm = T),
            "Thyroid Disease"=sum(thyroid_prob==1, na.rm = T),
            "Thyroid Disease - EMR"=sum(emr_thyroid_problem==1, na.rm = T),
            "Hypothyroidism"=sum(hypo_recode==1, na.rm = T),
            "Hypothyroidism - EMR"= sum(emr_hypothyroidism==1, na.rm = T),
            "Hyperthyroidism"=sum(hyper_recode==1, na.rm = T),
            "Hyperthyroidism - EMR"= sum(emr_hyperthyroidism==1, na.rm = T),
            "Thyroid Cancer"=sum(thy_cancer_recode==1, na.rm = T),
            "Thyroid Cancer - EMR"=sum(emr_thyroid_cancer==1, na.rm = T),
            "Diabetes Mellitus"=sum(dm_recode==1, na.rm = T),
            "Diabetes Mellitus - EMR"=sum(emr_dm_recode==1, na.rm = T),
            "Gout"=sum(gout==1, na.rm = T),
            "Congenital Heart Disease (CHD)"=sum(chd==1, na.rm = T),
            "Weak Kidney"=sum(weak_kidney==1, na.rm = T),
            "Hypertension"=sum(HTN==1, na.rm = T),
            "Stroke"=sum(stroke==1, na.rm = T),
            "Angina"=sum(angnia==1, na.rm = T),
            "Congestive Heart Failure (CHF)"=sum(chf==1, na.rm = T),
            "Heart Attack"=sum(heart_attack==1, na.rm = T),
            "Cholesterol"=sum(Cholesterol==1, na.rm = T),
            "PCOS"=sum(pos_recode==1, na.rm = T),
            "PCOS - EMR"=sum(emr_pos==1, na.rm = T),
            "Undescended Testes"=sum(undescended==1, na.rm = T),
            "Emphysema"=sum(emphysema==1, na.rm = T),
            "Autism Spectrum Disorder - EMR"=sum(emr_asd==1, na.rm = T),
            "Overweight"=sum(Overweight==1, na.rm = T),
            "Obesity - EMR"=sum(emr_obesity==1, na.rm = T),
            "Dental Health"=sum(Dental_health==4|Dental_health==5, na.rm = T))

Case_Count <- t(Case_Count)
colnames(Case_Count) <- c("Cases")

Control_Count <- Metals %>% 
  summarise("Arthritis"=sum(arthritis==2, na.rm = T),
            "Cancer"=sum(cancer==2, na.rm = T),
            "Cancer - EMR"=sum(emr_cancer==0, na.rm = T),
            "Goiter"=sum(goiter_recode==0, na.rm = T),
            "Goiter - EMR"=sum(emr_goiter==0, na.rm = T),
            "Thyroid Disease"=sum(thyroid_prob==2, na.rm = T),
            "Thyroid Disease - EMR"=sum(emr_thyroid_problem==0, na.rm = T),
            "Hypothyroidism"=sum(hypo_recode==0, na.rm = T),
            "Hypothyroidism - EMR"= sum(emr_hypothyroidism==0, na.rm = T),
            "Hyperthyroidism"=sum(hyper_recode==0, na.rm = T),
            "Hyperthyroidism - EMR"= sum(emr_hyperthyroidism==0, na.rm = T),
            "Thyroid Cancer"=sum(thy_cancer_recode==0, na.rm = T),
            "Thyroid Cancer - EMR"=sum(emr_thyroid_cancer==0, na.rm = T),
            "Diabetes Mellitus"=sum(dm_recode==0, na.rm = T),
            "Diabetes Mellitus - EMR"=sum(emr_dm_recode==0, na.rm = T),
            "Gout"=sum(gout==2, na.rm = T),
            "Congenital Heart Disease (CHD)"=sum(chd==2, na.rm = T),
            "Weak Kidney"=sum(weak_kidney==2, na.rm = T),
            "Hypertension"=sum(HTN==2, na.rm = T),
            "Stroke"=sum(stroke==2, na.rm = T),
            "Angina"=sum(angnia==2, na.rm = T),
            "Congestive Heart Failure (CHF)"=sum(chf==2, na.rm = T),
            "Heart Attack"=sum(heart_attack==2, na.rm = T),
            "Cholesterol"=sum(Cholesterol==2, na.rm = T),
            "PCOS"=sum(pos_recode==0, na.rm = T),
            "PCOS - EMR"=sum(emr_pos==0, na.rm = T),
            "Undescended Testes"=sum(undescended==2, na.rm = T),
            "Emphysema"=sum(emphysema==2, na.rm = T),
            "Autism Spectrum Disorder - EMR"=sum(emr_asd==0, na.rm = T),
            "Overweight"=sum(Overweight==2, na.rm = T),
            "Obesity - EMR"=sum(emr_obesity==0, na.rm = T),
            "Dental Health"=sum((Dental_health==1|Dental_health==2|Dental_health==3), na.rm = T))

Control_Count <- t(Control_Count)
colnames(Control_Count) <- c("Controls")

Case_Control <- cbind(Case_Count, Control_Count)
Case_Control <- as.data.frame(Case_Control)

Case_Control$Total <- Case_Control$Cases+Case_Control$Controls

kable(Case_Control)
Cases Controls Total
Arthritis 70 243 313
Cancer 23 293 316
Cancer - EMR 27 225 252
Goiter 9 298 307
Goiter - EMR 5 247 252
Thyroid Disease 142 176 318
Thyroid Disease - EMR 80 172 252
Hypothyroidism 92 201 293
Hypothyroidism - EMR 116 136 252
Hyperthyroidism 26 265 291
Hyperthyroidism - EMR 5 247 252
Thyroid Cancer 6 305 311
Thyroid Cancer - EMR 5 247 252
Diabetes Mellitus 44 137 181
Diabetes Mellitus - EMR 54 198 252
Gout 14 298 312
Congenital Heart Disease (CHD) 6 309 315
Weak Kidney 14 302 316
Hypertension 106 213 319
Stroke 7 309 316
Angina 8 307 315
Congestive Heart Failure (CHF) 7 307 314
Heart Attack 10 307 317
Cholesterol 133 182 315
PCOS 17 299 316
PCOS - EMR 2 250 252
Undescended Testes 2 307 309
Emphysema 4 311 315
Autism Spectrum Disorder - EMR 0 252 252
Overweight 165 157 322
Obesity - EMR 78 173 251
Dental Health 159 158 317

Note from Frank on 9/3/2022:

For the metals analysis, the health outcomes don’t look to me like outcomes likely to be related to metals exposure. It’s worthwhile to graph them to see if anything comes out, but I don’t think it will.

A couple of caveats: Research from laboratory animal studies show relationships between contaminant exposures and autism. I don’t think we have a big enough sample size to detect this, but it’s worth looking at.

I think you should compare our PC scores to BMI. We measured BMI so that should be available for every participant. Many contaminants are obesogenic.

Can we pull out the motor dysfunction health outcomes from our health survey and electronic medical records? I’m thinking of diseases such as Parkinson’s or anything else with a motor deficit. These can be caused by high exposure to metals.

Graphical Visualizations

Merging PCA with Survey and Health Data

colnames(metals.robpca) <- c("ID", "Ln_Mn55", "Ln_Cu65", "Ln_Cd111", "Ln_Hg202", "Ln_Pb208", "Ln_U238", "Site", "PC1", "PC2", "PC3", "PC4", "PC5")

Metals <- merge(Metals, metals.robpca, by=c("ID", "Site"))

colnames(Metals) #will need to check to see if any of the col names need to change (e.g. instead of PC1 it is listed as PC1.x or PC1.y).
##   [1] "ID"                        "Site"                     
##   [3] "Mn55"                      "Cu65"                     
##   [5] "Cd111"                     "Hg202"                    
##   [7] "Pb208"                     "U238"                     
##   [9] "Gender"                    "DOB_month"                
##  [11] "DOB_day"                   "DOB_year"                 
##  [13] "Ethnicity"                 "Race_ai_an"               
##  [15] "Race_asian"                "Race_black"               
##  [17] "Race_pac_isl"              "Race_white"               
##  [19] "Race_other"                "Race_dk"                  
##  [21] "Race_refuse"               "Race_other_TEXT"          
##  [23] "Marital_status"            "Num_in_household"         
##  [25] "Num_children"              "Child_under_18"           
##  [27] "Education"                 "Education_TEXT"           
##  [29] "Birth_country"             "Birth_country_TEXT"       
##  [31] "Armed_forces"              "Army"                     
##  [33] "Navy"                      "Marine_corp"              
##  [35] "Air_force"                 "Coast_guard"              
##  [37] "Other"                     "Armed_forces_dk"          
##  [39] "Armed_forces_refuse"       "Armed_forces_TEXT"        
##  [41] "Work_for_pay"              "Self_employed"            
##  [43] "Looking"                   "Laid_off"                 
##  [45] "Retired"                   "Homemaker"                
##  [47] "Student"                   "Disabled"                 
##  [49] "Maternity_sick_leave"      "employ_other"             
##  [51] "employ_other_A"            "employ_dk"                
##  [53] "employ_TEXT"               "Occupation"               
##  [55] "Farmer"                    "Pesticides"               
##  [57] "Lived_in_Yuma"             "Born_in_Yuma"             
##  [59] "Yuma_pregnant_mom"         "Residence_type"           
##  [61] "Residence_TEXT"            "Own_rent"                 
##  [63] "Own_rent_TEXT"             "Homeless"                 
##  [65] "Water_home"                "Water_home_TEXT"          
##  [67] "Water_cooking"             "Water_cooking_TEXT"       
##  [69] "Water_coffee"              "Water_coffee_TEXT"        
##  [71] "Water_work"                "Water_work_TEXT"          
##  [73] "Health_care"               "Health_care_source"       
##  [75] "Health_care_source_TEXT"   "Healthcare_lastyear"      
##  [77] "Health_consult"            "Health_consult_place"     
##  [79] "Health_consult_place_TEXT" "Routine_care"             
##  [81] "Routine_care_TEXT"         "Doctor_time"              
##  [83] "ER_visits"                 "afford_rx_meds"           
##  [85] "afford_mh_care"            "afford_dental_care"       
##  [87] "afford_eyeglasses"         "afford_specialist"        
##  [89] "afford_follow_up"          "Delay_telephone"          
##  [91] "Delay_appt"                "Delay_wait"               
##  [93] "Delay_open"                "Delay_transportation"     
##  [95] "Mental_health"             "General_health"           
##  [97] "Health_compare"            "Height_ft"                
##  [99] "Height_inch"               "Weight"                   
## [101] "Overweight"                "Dental_visit"             
## [103] "Dental_health"             "arthritis"                
## [105] "gout"                      "chf"                      
## [107] "chd"                       "angnia"                   
## [109] "heart_attack"              "stroke"                   
## [111] "emphysema"                 "thyroid_prob"             
## [113] "hyperthyroid"              "hypothyroid"              
## [115] "thyroid_cancer"            "goiter"                   
## [117] "weak_kidney"               "cancer"                   
## [119] "pos"                       "undescended"              
## [121] "ambiguous"                 "HTN"                      
## [123] "HTN_hx_meds"               "HTN_current_med"          
## [125] "Cholesterol"               "Chol_med"                 
## [127] "Diabetes"                  "DM_age"                   
## [129] "DM_age_TEXT"               "insulin"                  
## [131] "diabetic_pills"            "Income"                   
## [133] "age"                       "thyroid_prob_D"           
## [135] "Hyper_D"                   "Hypo_D"                   
## [137] "Thyroid_cancer_D"          "Goiter_D"                 
## [139] "thyroid_conditions"        "clinical"                 
## [141] "endo"                      "fT3"                      
## [143] "TSH"                       "fT4"                      
## [145] "TotalT4"                   "TotalT3"                  
## [147] "Cortisol"                  "latitude"                 
## [149] "longitude"                 "emr_height_inches"        
## [151] "emr_weight"                "emr_bmi"                  
## [153] "emr_thyroid_problem"       "emr_hyperthyroidism"      
## [155] "emr_hypothyroidism"        "emr_thyroid_cancer"       
## [157] "emr_goiter"                "emr_cancer"               
## [159] "emr_pos"                   "emr_diabetes"             
## [161] "emr_obesity"               "sc"                       
## [163] "emr_sud"                   "emr_asd"                  
## [165] "meds"                      "meds_name"                
## [167] "thyroid_med"               "free_t3_med"              
## [169] "tsh_med"                   "total_t4_med"             
## [171] "free_t4_med"               "total_t3_med"             
## [173] "cortisol_med"              "Fe57"                     
## [175] "As75"                      "height_total"             
## [177] "bmi"                       "age_group"                
## [179] "Site_Collapsed"            "hyper_recode"             
## [181] "hypo_recode"               "thy_cancer_recode"        
## [183] "goiter_recode"             "pos_recode"               
## [185] "dm_recode"                 "emr_dm_recode"            
## [187] "sud"                       "tobacco_history"          
## [189] "number_years"              "pack_per_day"             
## [191] "quit_date"                 "tremor"                   
## [193] "FAC1_1"                    "FAC2_1"                   
## [195] "FAC3_1"                    "FAC4_1"                   
## [197] "FAC5_1"                    "Date"                     
## [199] "DOB"                       "Ln_Mn55"                  
## [201] "Ln_Cu65"                   "Ln_Cd111"                 
## [203] "Ln_Hg202"                  "Ln_Pb208"                 
## [205] "Ln_U238"                   "PC1"                      
## [207] "PC2"                       "PC3"                      
## [209] "PC4"                       "PC5"

BMI / Weight

#BMI looks to be skewed (lognormally distributed)
hist(log(Metals$bmi))

hist(log(Metals$emr_bmi))

ggplot(Metals, aes(x=log(bmi), y=PC1))+
  geom_point(aes(color=as.factor(Site)))+
  geom_smooth(method=lm, se=F)+
  scale_color_discrete("Site", labels=c("1"="CSF", "2"="RCBH", "3"="YRMC"))+
  xlab("\nBMI")+
  theme_bw()
## `geom_smooth()` using formula 'y ~ x'
## Warning: Removed 8 rows containing non-finite values (stat_smooth).
## Warning: Removed 8 rows containing missing values (geom_point).

cor.test(Metals$PC1, log(Metals$bmi), method = "pearson") #is this the correct test to use?
## 
##  Pearson's product-moment correlation
## 
## data:  Metals$PC1 and log(Metals$bmi)
## t = 0.98045, df = 196, p-value = 0.3281
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
##  -0.07026461  0.20728357
## sample estimates:
##        cor 
## 0.06986136
ggplot(Metals, aes(x=log(bmi), y=PC2))+
  geom_point(aes(color=as.factor(Site)))+
  geom_smooth(method=lm, se=F)+
  scale_color_discrete("Site", labels=c("1"="CSF", "2"="RCBH", "3"="YRMC"))+
  xlab("\nBMI")+
  theme_bw()
## `geom_smooth()` using formula 'y ~ x'
## Warning: Removed 8 rows containing non-finite values (stat_smooth).
## Removed 8 rows containing missing values (geom_point).

cor.test(Metals$PC2, log(Metals$bmi), method = "pearson")
## 
##  Pearson's product-moment correlation
## 
## data:  Metals$PC2 and log(Metals$bmi)
## t = 1.643, df = 196, p-value = 0.102
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
##  -0.0232653  0.2519017
## sample estimates:
##       cor 
## 0.1165543
#Looking at binary variables for weight / obesity

ggplot(subset(Metals, Overweight==1|Overweight==2), aes(x=as.factor(Overweight), y=PC1, fill=as.factor(Site)))+
  geom_boxplot()+
  scale_fill_discrete("Site", labels=c("1"="CSF", "2"="RCBH", "3"="YRMC"))+
  scale_x_discrete(labels=c("1"="Yes", "2"="No"))+
  xlab("\nOverweight by Site")+
  theme_bw()

ggplot(subset(Metals, Overweight==1|Overweight==2), aes(x=as.factor(Overweight), y=PC1))+
  geom_boxplot()+
  scale_x_discrete(labels=c("1"="Yes", "2"="No"))+
  xlab("\nOverweight")+
  theme_bw()

ggplot(subset(Metals, Overweight==1|Overweight==2), aes(x=as.factor(Overweight), y=PC2, fill=as.factor(Site)))+
  geom_boxplot()+
  scale_fill_discrete("Site", labels=c("1"="CSF", "2"="RCBH", "3"="YRMC"))+
  scale_x_discrete(labels=c("1"="Yes", "2"="No"))+
  xlab("\nOverweight by Site")+
  theme_bw()

ggplot(subset(Metals, Overweight==1|Overweight==2), aes(x=as.factor(Overweight), y=PC2))+
  geom_boxplot()+
  scale_x_discrete(labels=c("1"="Yes", "2"="No"))+
  xlab("\nOverweight")+
  theme_bw()

ggplot(subset(Metals, emr_obesity==1|emr_obesity==0), aes(x=as.factor(emr_obesity), y=PC1, fill=as.factor(Site)))+
  geom_boxplot()+
  scale_fill_discrete("Site", labels=c("1"="CSF", "2"="RCBH", "3"="YRMC"))+
  scale_x_discrete(labels=c("1"="Yes", "0"="No"))+
  xlab("\nObesity (EMR)")+
  theme_bw()

ggplot(subset(Metals, emr_obesity==1|emr_obesity==0), aes(x=as.factor(emr_obesity), y=PC2, fill=as.factor(Site)))+
  geom_boxplot()+
  scale_fill_discrete("Site", labels=c("1"="CSF", "2"="RCBH", "3"="YRMC"))+
  scale_x_discrete(labels=c("1"="Yes", "0"="No"))+
  xlab("\nObesity (EMR)")+
  theme_bw()

Arthritis

ggplot(subset(Metals, arthritis==1|arthritis==2), aes(x=as.factor(arthritis), y=PC1, fill=as.factor(Site)))+
  geom_boxplot()+
  scale_fill_discrete("Site", labels=c("1"="CSF", "2"="RCBH", "3"="YRMC"))+
  scale_x_discrete(labels=c("1"="Yes", "2"="No"))+
  xlab("\nArthritis by Site")+
  theme_bw()

ggplot(subset(Metals, arthritis==1|arthritis==2), aes(x=as.factor(arthritis), y=PC1))+
  geom_boxplot()+
  scale_x_discrete(labels=c("1"="Yes", "2"="No"))+
  xlab("\nArthritis")+
  theme_bw()

ggplot(subset(Metals, arthritis==1|arthritis==2), aes(x=as.factor(arthritis), y=PC2, fill=as.factor(Site)))+
  geom_boxplot()+
  scale_fill_discrete("Site", labels=c("1"="CSF", "2"="RCBH", "3"="YRMC"))+
  scale_x_discrete(labels=c("1"="Yes", "2"="No"))+
  xlab("\nArthritis")+
  theme_bw()

Cancer

ggplot(subset(Metals, cancer==1|cancer==2), aes(x=as.factor(cancer), y=PC1, fill=as.factor(Site)))+
  geom_boxplot()+
  scale_fill_discrete("Site", labels=c("1"="CSF", "2"="RCBH", "3"="YRMC"))+
  scale_x_discrete(labels=c("1"="Yes", "2"="No"))+
  xlab("\nCancers")+
  theme_bw()

ggplot(subset(Metals, cancer==1|cancer==2), aes(x=as.factor(cancer), y=PC2, fill=as.factor(Site)))+
  geom_boxplot()+
  scale_fill_discrete("Site", labels=c("1"="CSF", "2"="RCBH", "3"="YRMC"))+
  scale_x_discrete(labels=c("1"="Yes", "2"="No"))+
  xlab("\nCancer")+
  theme_bw()

Diabetes Mellitus

ggplot(subset(Metals, dm_recode==1|dm_recode==0), aes(x=as.factor(dm_recode), y=PC1, fill=as.factor(Site)))+
  geom_boxplot()+
  scale_fill_discrete("Site", labels=c("1"="CSF", "2"="RCBH", "3"="YRMC"))+
  scale_x_discrete(labels=c("1"="Yes", "0"="No"))+
  xlab("\nDiabetes Mellitus")+
  theme_bw()

ggplot(subset(Metals, dm_recode==1|dm_recode==0), aes(x=as.factor(dm_recode), y=PC2, fill=as.factor(Site)))+
  geom_boxplot()+
  scale_fill_discrete("Site", labels=c("1"="CSF", "2"="RCBH", "3"="YRMC"))+
  scale_x_discrete(labels=c("1"="Yes", "0"="No"))+
  xlab("\nDiabetes Mellitus")+
  theme_bw()

Cholesterol

ggplot(subset(Metals, Cholesterol==1|Cholesterol==2), aes(x=as.factor(Cholesterol), y=PC1, fill=as.factor(Site)))+
  geom_boxplot()+
  scale_fill_discrete("Site", labels=c("1"="CSF", "2"="RCBH", "3"="YRMC"))+
  scale_x_discrete(labels=c("1"="Yes", "2"="No"))+
  xlab("\nCholesterol")+
  theme_bw()

ggplot(subset(Metals, Cholesterol==1|Cholesterol==2), aes(x=as.factor(Cholesterol), y=PC2, fill=as.factor(Site)))+
  geom_boxplot()+
  scale_fill_discrete("Site", labels=c("1"="CSF", "2"="RCBH", "3"="YRMC"))+
  scale_x_discrete(labels=c("1"="Yes", "2"="No"))+
  xlab("\nCholesterol")+
  theme_bw()

Dental Health

ggplot(subset(Metals, Dental_health==1|Dental_health==2|Dental_health==3|Dental_health==4|Dental_health==5), aes(x=as.factor(Dental_health), y=PC1))+
  geom_boxplot()+
  scale_x_discrete(labels=c("1"="Excellent", "2"="Very Good", "3"="Good", "4"="Fair", "5"="Poor"))+
  xlab("\nDental Health")+
  theme_bw()

ggplot(subset(Metals, Dental_health==1|Dental_health==2|Dental_health==3|Dental_health==4|Dental_health==5), aes(x=as.factor(Dental_health), y=PC2))+
  geom_boxplot()+
  scale_x_discrete(labels=c("1"="Excellent", "2"="Very Good", "3"="Good", "4"="Fair", "5"="Poor"))+
  xlab("\nDental Health")+
  theme_bw()

An Analysis of PC1 and PC2 Compared to Yuma Health Outcomes

\(Note:\) The following scripts must be run prior to completing this analysis: Data_Clean.Rmd, Yuma_Metals_Analysis.Rmd, Demographics.Rmd, yuma-metals-20210821.Rmd, and Exploration.Rmd. This will provide the accurate dataframes used in the following code.

Data Descriptions

#create_report(Metals) (Remove the # when you actually want to run - takes too much processing power otherwise.)

Data Visualization - Exploratory

Concentrations by Date

#There is a single data point for the year 2013, with the rest being April-November 2018.
#This is likely either a mistake in input or some other error, but I will not include this point in subsequent analysis.
#The concentration values for this single row are blank anyway.
ggplot(Metals, aes(y=PC1, x=Date, color=as.factor(Site)))+
  geom_point()+
  geom_smooth(se=F)+
  scale_color_discrete("Site", labels=c("1"="CSF", "2"="RCBH", "3"="YRMC"))+
  theme(panel.background = element_rect(fill="#ffffff"),
        axis.text.x=element_text(angle=60, hjust=1),
        axis.line = element_line(color="#000000"))+
  scale_x_date(date_breaks = "1 month", date_labels =  "%b")+
  ylab("PC1")+
  xlab("Date of Sample Collection")
## `geom_smooth()` using method = 'loess' and formula 'y ~ x'
## Warning: Removed 4 rows containing non-finite values (stat_smooth).
## Warning: Removed 4 rows containing missing values (geom_point).

ggplot(Metals, aes(y=PC2, x=Date, color=as.factor(Site)))+
  geom_point()+
  geom_smooth(se=F)+
  scale_color_discrete("Site", labels=c("1"="CSF", "2"="RCBH", "3"="YRMC"))+
  theme(panel.background = element_rect(fill="#ffffff"),
        axis.text.x=element_text(angle=60, hjust=1),
        axis.line = element_line(color="#000000"))+
  scale_x_date(date_breaks = "1 month", date_labels =  "%b")+
  ylab("PC2")+
  xlab("Date of Sample Collection")
## `geom_smooth()` using method = 'loess' and formula 'y ~ x'
## Warning: Removed 4 rows containing non-finite values (stat_smooth).
## Removed 4 rows containing missing values (geom_point).

#Relevant paper: https://www.sciencedirect.com/science/article/pii/S1878535212000263 

Tables and Observations for Each Metal

Metals %>% 
  group_by(Site) %>% 
  summarize(number_NAs_PC1=sum(is.na(PC1)),
            number_NAs_PC2=sum(is.na(PC2)))
## # A tibble: 3 × 3
##    Site number_NAs_PC1 number_NAs_PC2
##   <dbl>          <int>          <int>
## 1     1              0              0
## 2     2              0              0
## 3     3              0              0
#Making a table re: PC values
knitr::kable(Metals %>%
               group_by(Site) %>% 
               summarize(n_PC1=sum(!is.na(PC1)),
                         geomean_PC1=round(exp(mean(PC1, na.rm = T)), digits = 2),
                         geosd_PC1=round(exp(sd(PC1, na.rm = T)), digits = 2),
                         n_PC2=sum(!is.na(PC2)),
                         geomean_PC2=round(exp(mean(PC2, na.rm = T)), digits = 2),
                         geosd_PC2=round(exp(sd(PC2, na.rm = T)), digits = 2)))
Site n_PC1 geomean_PC1 geosd_PC1 n_PC2 geomean_PC2 geosd_PC2
1 27 1.11 2.32 27 1.31 1.41
2 101 1.56 2.55 101 1.08 1.54
3 78 0.79 1.90 78 0.76 1.74
#create_report(data.melt) (Only run if you need to.)

Distribution of PCA Data

hist(Metals$PC1)

hist(Metals$PC2)

Plotting Hormone Levels Against Metal Concentrations

#Cortisol against PCA results
PC1_Cortisol <- ggplot(Metals, aes(x=PC1, y=Cortisol))+geom_point()
PC2_Cortisol <- ggplot(Metals, aes(x=PC2, y=Cortisol))+geom_point()

grid.arrange(PC1_Cortisol, PC2_Cortisol)
## Warning: Removed 6 rows containing missing values (geom_point).
## Removed 6 rows containing missing values (geom_point).